Day 1: Sequence analysis using TraMineR - EDA

Day 2: Sequence analysis using TraMineR - Matching, clustering

Day 3: Association rule mining using priori algorithm

Loading libaries

library(readr)
library(ggplot2)
library(data.table)
library(TraMineR)
library(dplyr)
library(tidyr)
library(TraMineR)
data(mvad)
seqstatl(mvad[, 17:86])
[1] "employment"  "FE"          "HE"          "joblessness" "school"      "training"   
mvad.alphabet <- c("employment", "FE", "HE", "joblessness", "school", 
    "training")
mvad.labels <- c("employment", "further education", "higher education", 
    "joblessness", "school", "training")
mvad.scodes <- c("EM", "FE", "HE", "JL", "SC", "TR")
mvad.seq <- seqdef(mvad, 17:86, alphabet = mvad.alphabet, states = mvad.scodes, 
    labels = mvad.labels, xtstep = 6)
 [>] state coding:
       [alphabet]  [label]  [long label] 
     1  employment  EM       employment
     2  FE          FE       further education
     3  HE          HE       higher education
     4  joblessness JL       joblessness
     5  school      SC       school
     6  training    TR       training
 [>] 712 sequences in the data set
 [>] min/max sequence length: 70/70
seqiplot(mvad.seq, with.legend = FALSE, title= "Index plot (10 first sequences")
 [!!] In seqiplot() : title is deprecated, use main instead.

seqfplot(mvad.seq, withlegend = F, title = "Sequence frequency plot bar width proportional to the frequencies")
 [!!] In seqfplot() : title is deprecated, use main instead.
 [!!] In seqfplot() : withlegend is deprecated, use with.legend instead.

 seqdplot(mvad.seq, withlegend = F,  title = "State distribution plot")
 [!!] In seqdplot() : title is deprecated, use main instead.
 [!!] In seqdplot() : withlegend is deprecated, use with.legend instead.

Import log data

lasi21_logdata <- read_csv("lasi21_logdata.csv")

── Column specification ─────────────────────────────────────────────────────────────────────
cols(
  id = col_double(),
  sas_id_site = col_double(),
  site_type = col_character(),
  date_time = col_datetime(format = ""),
  spent_time = col_double(),
  action = col_character(),
  instancename = col_character(),
  `avg score` = col_double(),
  PassFlag = col_double()
)
lasi21_logdata$site_type <- ifelse(grepl("ssignment", lasi21_logdata$instancename), "assignment",lasi21_logdata$site_type )
lasi21_logdata$site_type <- ifelse(grepl("exam", lasi21_logdata$instancename), "assignment",lasi21_logdata$site_type )

lasi21_logdata <- as.data.table(lasi21_logdata[,c("id","date_time","spent_time","site_type","instancename","PassFlag","avg score")])
head(lasi21_logdata)

Data pre-processing

# convert ms to minutes
lasi21_logdata$spent_time_m <- round(lasi21_logdata$spent_time/60000, digits=1)

# filter out all spent_time < 6s
lasi21_logdata2 <- lasi21_logdata %>% filter(spent_time>6000)

# create a flag for session break (the first click per student)
lasi21_logdata2$session_flag = 0
lasi21_logdata2 <- lasi21_logdata2 %>% arrange(id,date_time,spent_time) %>% 
    group_by(id) %>%
    mutate(session_flag  = +(row_number() %in% 1))

# create a flag for session break (aka where time spent > 30 minutes)
lasi21_logdata2$session_flag <- ifelse(lasi21_logdata2$spent_time_m>30, 1, lasi21_logdata2$session_flag)

# create session number 
lasi21_logdata2$session_num = cumsum(lasi21_logdata2$session_flag)

# remove session break
lasi21_logdata2 <- as.data.table(lasi21_logdata2)
lasi21_logdata2 <- lasi21_logdata2[session_flag!=1,]


# for each learning session, calculate the cummulative time spent
lasi21_logdata2 <- lasi21_logdata2 %>% 
                        arrange(id,date_time,spent_time) %>% 
                        group_by(session_num) %>% 
                        mutate(spent_time_m_cum = cumsum(spent_time_m))

# create time unit as 1/10 of a minute (6s)
lasi21_logdata2$time_unit <- round(lasi21_logdata2$spent_time_m_cum*10,digits=0)

lasi21_logdata_session <- lasi21_logdata2 %>% group_by(session_num) %>% top_n(1, spent_time_m_cum)

hist(lasi21_logdata_session$spent_time_m_cum, main="Learning session length", xlab = "Minutes", breaks=200)


# Remove learning session < 5 mins
lasi21_logdata2 <- merge(lasi21_logdata2,lasi21_logdata_session[lasi21_logdata_session$spent_time_m_cum>5 & lasi21_logdata_session$spent_time_m_cum<120, c("session_num")])

Transform data into STS format

# create a function to expand time unit for each learning session

f1 <- function(x1){
    x1 <- 1:max(x1)
    m1 <- max(c(length(x1)))
    length(x1) <- m1
    list(time_unit = x1)
}

# create a subset
lasi21_logdata3 <- lasi21_logdata2[,c("session_num","time_unit","site_type")]

# create an expanded df
test <- setDT(lasi21_logdata3)[, f1(time_unit), .(session_num)]

# merge with log data
test <- merge(test,lasi21_logdata3,by=c("session_num","time_unit"),all.x=TRUE)

# fill missing upward
test <- test %>% fill(site_type,.direction = "up")
# Sequence length distribution
hist(lasi21_logdata3[,max(time_unit), by="session_num"]$V1, breaks=100, main="Length of learning session", xlab='Sequence length')

# cummulative plot of seq length
plot(ecdf(lasi21_logdata3[,max(time_unit), by="session_num"]$V1), main ='Cummulative distribution of seq length')

Define sequences

# define sequences columns
log_sts.seq <- seqdef(log_sts,2:1200)
 [>] found missing values ('NA') in sequence data
 [>] preparing 18602 sequences
 [>] coding void elements with '%' and missing values with '*'
 [>] 12 distinct states appear in the data: 
     1 = assignment
     2 = collaborate
     3 = content
     4 = forumng
     5 = glossary
     6 = homepage
     7 = questionnaire
     8 = resource
     9 = studio
     10 = subpage
     11 = url
     12 = wiki
 [>] state coding:
       [alphabet]    [label]       [long label] 
     1  assignment    assignment    assignment
     2  collaborate   collaborate   collaborate
     3  content       content       content
     4  forumng       forumng       forumng
     5  glossary      glossary      glossary
     6  homepage      homepage      homepage
     7  questionnaire questionnaire questionnaire
     8  resource      resource      resource
     9  studio        studio        studio
     10  subpage       subpage       subpage
     11  url           url           url
     12  wiki          wiki          wiki
 [>] 18602 sequences in the data set
 [>] min/max sequence length: 51/1199

EDA

# plot the first 10 sequences with length=100
layout(matrix(c(1,1,1,2), nrow=1, byrow=TRUE))
seqiplot(log_sts.seq[611:631,1:100], with.legend = F, main = "Index plot (10 first sequences)")
seqlegend(log_sts.seq)

# State distribution plot
layout(matrix(c(1,1,1,2), nrow=1, byrow=TRUE))
seqdplot(log_sts.seq[,1:200], main = "State distribution plot", with.legend = FALSE)
seqlegend(log_sts.seq)

# Sequence frequency plot
layout(matrix(c(1,1,1,2), nrow=1, byrow=TRUE))
seqfplot(log_sts.seq[,1:200], main = "Sequence frequency plot", with.legend = FALSE, pbarw = TRUE)
seqlegend(log_sts.seq)

# mean time by state
layout(matrix(c(1,1,1,2), nrow=1, byrow=TRUE))
seqmtplot(log_sts.seq[,1:200], main = "Mean time plot", with.legend = FALSE)
seqlegend(log_sts.seq)

# Stability within sequences
# Shannon's entropy as a measure of the diversity of states observed at the considered time point
# It equals 0 when all cases are in the same state  (it is thus easy to predict in which state an individual is)
# It is maximum when the cases are equally distributed between the states
seqHtplot(log_sts.seq[,1:200], title = "Entropy index")
 [!!] In seqHtplot() : title is deprecated, use main instead.

# Turbulence
Turbulence <- seqST(log_sts.seq[,1:200])
summary(Turbulence)
   Turbulence    
 Min.   : 1.000  
 1st Qu.: 2.000  
 Median : 4.038  
 Mean   : 4.704  
 3rd Qu.: 6.641  
 Max.   :32.312  
hist(Turbulence, col = "cyan", main = "Sequence turbulence",breaks=50)

# Transitions of events

# define seq transitions
log_sts.seqe <- seqecreate(log_sts.seq[,1:200])

# find frequent subsequences
fsubseq <- seqefsub(log_sts.seqe, pMinSupport = 0.05)
 [!!] In seqefsub() : pMinSupport is deprecated, use pmin.support instead.
# plot 15 most frequent subsquences
plot(fsubseq[1:15], col = "cyan", main="Top 15 frequent subsequences")

Sequence similarities and clustering

# Compute sequence distances using OM with transition rate as substitution cost
log_sts.seq.om1 <- round(seqdist(log_sts.seq[1:1000,1:200], method = "OM", indel = 1, sm = "CONSTANT"),2)
 [>] 1000 sequences with 12 distinct states
 [>] Computing sm with seqcost using  CONSTANT
 [>] creating 12x12 substitution-cost matrix using 2 as constant value
 [>] 846 distinct  sequences 
 [>] min/max sequence lengths: 51/200
 [>] computing distances using the OM metric
 [>] elapsed time: 28.993 secs

fviz_nbclust(log_sts.seq.om1, FUN = hcut, method = "silhouette")+
labs(subtitle = "Silhouette method")

cluster3 <- cutree(clusterward, k = 3)
cluster3 <- factor(HC3c, labels = paste("Type", 1:3))
table(cluster3)
cluster3
Type 1 Type 2 Type 3 
   442    291    267 
# frequency plot of sequences by cluster
seqfplot(log_sts.seq[1:1000,1:200], group = cluster3, pbarw = T)

seqmtplot(log_sts.seq[1:1000,1:200], group = cluster3)

# layout(matrix(c(1,1,1,2), nrow=1, byrow=TRUE))
seqdplot(log_sts.seq[1:1000,1:200], group = cluster3, border = NA, with.legend = T)

# seqlegend(log_sts.seq)
covar$grade <- rnorm(1000, mean=70, sd=10); covar$grade <- covar$grade[covar$grade > 1 & covar$grade < 100]
Error in `$<-.data.frame`(`*tmp*`, grade, value = c(72.1121413037559,  : 
  replacement has 998 rows, data has 1000
reglog <- list()
for (i in 1:length(levels(cluster3))) {
reglog[[i]] <- glm((cluster3 == levels(cluster3)[i]) ~ condition + grade + surveymetric1 + surveymetric2, family = "binomial", data = covar)}
tbls <- list()
for (i in 1:length(levels(cluster3))) {tbls[[i]] <- tbl_regression(reglog[[i]], exponentiate = TRUE)}

tbl_merge(
    tbls = tbls,
    tab_spanner = c("**Type 1**", "**Type 2**", "**Type 3**")
  )
Characteristic Type 1 Type 2 Type 3
OR1 95% CI1 p-value OR1 95% CI1 p-value OR1 95% CI1 p-value
condition
Control — — — — — —
Treatment 0.90 0.70, 1.16 0.4 1.15 0.87, 1.51 0.3 0.99 0.75, 1.31 >0.9
grade 1.00 0.99, 1.02 0.5 0.99 0.98, 1.01 0.4 1.00 0.99, 1.02 0.8
surveymetric1 0.99 0.71, 1.36 >0.9 0.87 0.61, 1.24 0.4 1.18 0.82, 1.69 0.4
surveymetric2 0.98 0.80, 1.19 0.8 0.99 0.80, 1.23 >0.9 1.04 0.83, 1.30 0.7

1 OR = Odds Ratio, CI = Confidence Interval

Comparison of sequences

---
title: "Temporal and sequential analysis for learning analytics"
author: "Quan Nguyen, Postdoctoral Fellow, School of Information, University of Michigan"
output: html_notebook
---


# Day 1: Sequence analysis using TraMineR - EDA
# Day 2: Sequence analysis using TraMineR - Matching, clustering
# Day 3: Association rule mining using priori algorithm

## Loading libaries

```{r}
library(readr)
library(ggplot2)
library(data.table)
library(TraMineR)
library(dplyr)
library(tidyr)
```

```{r}
library(TraMineR)
data(mvad)
seqstatl(mvad[, 17:86])
mvad.alphabet <- c("employment", "FE", "HE", "joblessness", "school", 
    "training")
mvad.labels <- c("employment", "further education", "higher education", 
    "joblessness", "school", "training")
mvad.scodes <- c("EM", "FE", "HE", "JL", "SC", "TR")
mvad.seq <- seqdef(mvad, 17:86, alphabet = mvad.alphabet, states = mvad.scodes, 
    labels = mvad.labels, xtstep = 6)

```
```{r}
head(mvad)
```

```{r}
seqiplot(mvad.seq, with.legend = FALSE, title= "Index plot (10 first sequences")
```
```{r}
seqIplot(mvad.seq, sortv = "from.start", with.legend = FALSE)
```
```{r}
seqfplot(mvad.seq, withlegend = F, title = "Sequence frequency plot bar width proportional to the frequencies")
```
```{r}
 seqdplot(mvad.seq, withlegend = F,  title = "State distribution plot")
```


# Import log data

```{r}
lasi21_logdata <- read_csv("lasi21_logdata.csv")
lasi21_logdata$site_type <- ifelse(grepl("ssignment", lasi21_logdata$instancename), "assignment",lasi21_logdata$site_type )
lasi21_logdata$site_type <- ifelse(grepl("exam", lasi21_logdata$instancename), "assignment",lasi21_logdata$site_type )

lasi21_logdata <- as.data.table(lasi21_logdata[,c("id","date_time","spent_time","site_type","instancename","PassFlag","avg score")])
head(lasi21_logdata)
```

# Data pre-processing
```{r}
# convert ms to minutes
lasi21_logdata$spent_time_m <- round(lasi21_logdata$spent_time/60000, digits=1)

# filter out all spent_time < 6s
lasi21_logdata2 <- lasi21_logdata %>% filter(spent_time>6000)

# create a flag for session break (the first click per student)
lasi21_logdata2$session_flag = 0
lasi21_logdata2 <- lasi21_logdata2 %>% arrange(id,date_time,spent_time) %>% 
    group_by(id) %>%
    mutate(session_flag  = +(row_number() %in% 1))

# create a flag for session break (aka where time spent > 30 minutes)
lasi21_logdata2$session_flag <- ifelse(lasi21_logdata2$spent_time_m>30, 1, lasi21_logdata2$session_flag)

# create session number 
lasi21_logdata2$session_num = cumsum(lasi21_logdata2$session_flag)

# remove session break
lasi21_logdata2 <- as.data.table(lasi21_logdata2)
lasi21_logdata2 <- lasi21_logdata2[session_flag!=1,]


# for each learning session, calculate the cummulative time spent
lasi21_logdata2 <- lasi21_logdata2 %>% 
                        arrange(id,date_time,spent_time) %>% 
                        group_by(session_num) %>% 
                        mutate(spent_time_m_cum = cumsum(spent_time_m))

# create time unit as 1/10 of a minute (6s)
lasi21_logdata2$time_unit <- round(lasi21_logdata2$spent_time_m_cum*10,digits=0)

lasi21_logdata_session <- lasi21_logdata2 %>% group_by(session_num) %>% top_n(1, spent_time_m_cum)

hist(lasi21_logdata_session$spent_time_m_cum, main="Learning session length", xlab = "Minutes", breaks=200)

# Remove learning session < 5 mins
lasi21_logdata2 <- merge(lasi21_logdata2,lasi21_logdata_session[lasi21_logdata_session$spent_time_m_cum>5 & lasi21_logdata_session$spent_time_m_cum<120, c("session_num")])

```

# Transform data into STS format
```{r}
# create a function to expand time unit for each learning session

f1 <- function(x1){
    x1 <- 1:max(x1)
    m1 <- max(c(length(x1)))
    length(x1) <- m1
    list(time_unit = x1)
}

# create a subset
lasi21_logdata3 <- lasi21_logdata2[,c("session_num","time_unit","site_type")]

# create an expanded df
test <- setDT(lasi21_logdata3)[, f1(time_unit), .(session_num)]

# merge with log data
test <- merge(test,lasi21_logdata3,by=c("session_num","time_unit"),all.x=TRUE)

# fill missing upward
test <- test %>% fill(site_type,.direction = "up")
```



```{r}
# Sequence length distribution
hist(lasi21_logdata3[,max(time_unit), by="session_num"]$V1, breaks=100, main="Length of learning session", xlab='Sequence length')
```

```{r}
# cummulative plot of seq length
plot(ecdf(lasi21_logdata3[,max(time_unit), by="session_num"]$V1), main ='Cummulative distribution of seq length')
```
# Define sequences

```{r}
# convert to STS format (wide format)
log_sts <- spread(test, time_unit, site_type)

# define sequences columns
log_sts.seq <- seqdef(log_sts,2:1200)

```

# EDA

```{r}
# plot the first 10 sequences with length=100
layout(matrix(c(1,1,1,2), nrow=1, byrow=TRUE))
seqiplot(log_sts.seq[611:631,1:100], with.legend = F, main = "Index plot (10 first sequences)")
seqlegend(log_sts.seq)
```
```{r}
# Plot 200 sequences sorted by LCS (longest common subsequence) distance
dist.mostfreq <- seqdist(log_sts.seq, method = "LCS", refseq = 0)
seqIplot(log_sts.seq[1:200,1:200], border = NA, sortv = dist.mostfreq, with.legend=F)
```


```{r}
# State distribution plot
layout(matrix(c(1,1,1,2), nrow=1, byrow=TRUE))
seqdplot(log_sts.seq[,1:200], main = "State distribution plot", with.legend = FALSE)
seqlegend(log_sts.seq)
```

```{r}
# Sequence frequency plot
layout(matrix(c(1,1,1,2), nrow=1, byrow=TRUE))
seqfplot(log_sts.seq[,1:200], main = "Sequence frequency plot", with.legend = FALSE, pbarw = TRUE)
seqlegend(log_sts.seq)
```
```{r}
# mean time by state
layout(matrix(c(1,1,1,2), nrow=1, byrow=TRUE))
seqmtplot(log_sts.seq[,1:200], main = "Mean time plot", with.legend = FALSE)
seqlegend(log_sts.seq)
```

```{r}
# Stability within sequences
# Shannon's entropy as a measure of the diversity of states observed at the considered time point
# It equals 0 when all cases are in the same state  (it is thus easy to predict in which state an individual is)
# It is maximum when the cases are equally distributed between the states
seqHtplot(log_sts.seq[,1:200], title = "Entropy index")
```

```{r}
# Turbulence
Turbulence <- seqST(log_sts.seq[,1:200])
summary(Turbulence)
hist(Turbulence, col = "cyan", main = "Sequence turbulence",breaks=50)

```

```{r}
# Transitions of events

# define seq transitions
log_sts.seqe <- seqecreate(log_sts.seq[,1:200])

# find frequent subsequences
fsubseq <- seqefsub(log_sts.seqe, pMinSupport = 0.05)

# plot 15 most frequent subsquences
plot(fsubseq[1:15], col = "cyan", main="Top 15 frequent subsequences")

```

# Sequence similarities and clustering

```{r}
# Compute sequence distances using OM with transition rate as substitution cost
log_sts.seq.om1 <- round(seqdist(log_sts.seq[1:1000,1:200], method = "OM", indel = 1, sm = "CONSTANT"),2)
```
```{r}
# apply agglomerate hierarchical clustering
library(cluster)
clusterward <- agnes(log_sts.seq.om1, diss = TRUE, method = "ward")
hcd <- as.dendrogram(clusterward)
plot(hcd, which.plots = 2, main="Dendogram of sequences clustering", leaflab = "none")
```
```{r}
library(factoextra)
# Plot scree plot using wss
fviz_nbclust(log_sts.seq.om1, FUN = hcut, method = "wss")
```
```{r}
# Plot Silhouette 
fviz_nbclust(log_sts.seq.om1, FUN = hcut, method = "silhouette")+
labs(subtitle = "Silhouette method")
```
```{r}
# Use k=3 as the number of cluster
cluster3 <- cutree(clusterward, k = 3)
cluster3 <- factor(HC3c, labels = paste("Type", 1:3))
table(cluster3)
```
```{r}
# frequency plot of sequences by cluster
seqfplot(log_sts.seq[1:1000,1:200], group = cluster3, pbarw = T)
```

```{r}
# mean time plot of states by cluster membership
seqmtplot(log_sts.seq[1:1000,1:200], group = cluster3)
```
```{r}
# layout(matrix(c(1,1,1,2), nrow=1, byrow=TRUE))
seqdplot(log_sts.seq[1:1000,1:200], group = cluster3, border = NA, with.legend = T)
# seqlegend(log_sts.seq)

```

```{r}
# made up covariates for 1000 sequences (e.g., condition, grade, survey_metric1, survey_metric2)
covar <- data.frame(matrix(ncol = 4, nrow = 1000))
colnames(covar) <- c("condition", "grade", "surveymetric1", "surveymetric2")

set.seed(123)
covar$condition <- sample(x=c("Control","Treatment"),1000,prob=c(0.5,0.5),replace=T)

set.seed(123)
covar$grade <- round(rnorm(1000, mean=60, sd=10),0)

set.seed(123)
covar$surveymetric1 <- sample(x=1:5, prob = c(0.1,0.2,0.4,0.2,0.1), replace = T)

set.seed(123)
covar$surveymetric2 <- sample(x=1:5, prob = c(0.1,0.2,0.3,0.3,0.1), replace = T)
```

```{r}
library(gtsummary)

# run logistic regression for each cluster of sequences
reglog <- list()
for (i in 1:length(levels(cluster3))) {reglog[[i]] <- glm((cluster3 == levels(cluster3)[i]) ~ 
                                                            condition + grade + surveymetric1 + surveymetric2, 
                                                            family = "binomial", 
                                                            data = covar)}
# create nice summary output using gtsummary package 
# http://www.danieldsjoberg.com/gtsummary/articles/tbl_regression.html
tbls <- list()
for (i in 1:length(levels(cluster3))) {tbls[[i]] <- tbl_regression(reglog[[i]], exponentiate = TRUE)}

tbl_merge(
    tbls = tbls,
    tab_spanner = c("**Type 1**", "**Type 2**", "**Type 3**")
  )
```

# Comparison of sequences

```{r}
# Plot sequences by group
dist.refseq <- seqdist(log_sts.seq[1:1000,1:200], refseq = 0, method = "LCS")

seqIplot(log_sts.seq[1:1000,1:200], group = covar$condition, sortv = dist.refseq, with.legend = "right")
```



